In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
In [2]:
# Load the workbook 'GTD.xlsx' (path is relative to the working directory) into a DataFrame.
gtd = pd.read_excel('GTD.xlsx')
In [3]:
gtd.head()
Out[3]:
In [4]:
gtd.region.value_counts()
Out[4]:
In [5]:
# Drop the leading "i" from the three date-part column names.
date_column_names = {
    'iyear': 'year',
    'imonth': 'month',
    'iday': 'day',
}
gtd = gtd.rename(columns=date_column_names)
In [6]:
# Remove the rows whose month is recorded as 0.
month_is_known = gtd['month'] != 0
gtd = gtd.loc[month_is_known]
In [7]:
# Remove the rows whose day is recorded as 0.
day_is_known = gtd['day'] != 0
gtd = gtd.loc[day_is_known]
In [8]:
gtd.columns
Out[8]:
In [9]:
gtd.isnull().sum()
Out[9]:
In [10]:
gtd.attacktype1.value_counts()
Out[10]:
In [11]:
gtd.city.value_counts()
Out[11]:
In [12]:
gtd.region_txt.value_counts()
Out[12]:
In [13]:
# 1. Assassination
# 2. Armed assault
# 3. Bombing/explosion
# 4. Hijacking
# 5. Hostage Taking (barricade incident)
# 6. Hostage Taking (kidnapping)
# 7. Facility/infrastructure attack
# 8. Unarmed assault
# 9. Unknown
In [14]:
# Attacks recorded in the Middle East & North Africa region.
me_attks = gtd.loc[gtd['region_txt'] == 'Middle East & North Africa']
In [15]:
# Histogram (no KDE) of attack-type codes for the MENA subset.
ax = sns.distplot(me_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in the MENA Region')
plt.show()
In [16]:
# Attacks recorded in the South America region.
s_am_attks = gtd.loc[gtd['region_txt'] == 'South America']
In [17]:
# Histogram (no KDE) of attack-type codes for the South America subset.
ax = sns.distplot(s_am_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in South America')
plt.show()
In [18]:
# South Asia: subset the attacks, then histogram their attack-type codes.
sa_attks = gtd.loc[gtd['region_txt'] == 'South Asia']
ax = sns.distplot(sa_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in South Asia')
plt.show()
In [19]:
# Western Europe: subset the attacks, then histogram their attack-type codes.
we_attks = gtd.loc[gtd['region_txt'] == 'Western Europe']
ax = sns.distplot(we_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Western Europe')
plt.show()
In [20]:
# Sub-Saharan Africa: subset the attacks, then histogram their attack-type codes.
sub_af_attks = gtd.loc[gtd['region_txt'] == 'Sub-Saharan Africa']
ax = sns.distplot(sub_af_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Sub Saharan Africa')
plt.show()
# Only one or two regions show higher armed assaults than bombings.
In [21]:
# Southeast Asia: subset the attacks, then histogram their attack-type codes.
sea_attks = gtd.loc[gtd['region_txt'] == 'Southeast Asia']
ax = sns.distplot(sea_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Southeast Asia')
plt.show()
In [22]:
# Central America & Caribbean: subset the attacks, then histogram their attack-type codes.
cac_attks = gtd.loc[gtd['region_txt'] == 'Central America & Caribbean']
ax = sns.distplot(cac_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Central America and the Caribbean')
plt.show()
In [23]:
# Eastern Europe: subset the attacks, then histogram their attack-type codes.
ee_attks = gtd.loc[gtd['region_txt'] == 'Eastern Europe']
ax = sns.distplot(ee_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Eastern Europe')
plt.show()
In [24]:
# North America: subset the attacks, then histogram their attack-type codes.
na_attks = gtd.loc[gtd['region_txt'] == 'North America']
ax = sns.distplot(na_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in North America')
plt.show()
In [25]:
# East Asia: subset the attacks, then histogram their attack-type codes.
ea_attks = gtd.loc[gtd['region_txt'] == 'East Asia']
ax = sns.distplot(ea_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in East Asia')
plt.show()
In [26]:
# Central Asia: subset the attacks, then histogram their attack-type codes.
ca_attks = gtd.loc[gtd['region_txt'] == 'Central Asia']
ax = sns.distplot(ca_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Central Asia')
plt.show()
In [27]:
# Australasia & Oceania: subset the attacks, then histogram their attack-type
# codes using an explicit 9 bins.
ao_attks = gtd.loc[gtd['region_txt'] == 'Australasia & Oceania']
ax = sns.distplot(ao_attks['attacktype1'], kde=False, bins=9)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Australasia & Oceania')
plt.show()
In [28]:
# The most frequent attack type across all regions is bombing, except in Sub-Saharan
# Africa and Central America/the Caribbean, where armed assault is relatively more frequent.
# In North America, Australasia/Oceania, and Western Europe, facility/infrastructure attacks
# are more frequent than in other areas. This makes sense, as these regions are generally
# more developed than others in terms of urban infrastructure.
In [29]:
me_attks.groupby('attacktype1').year.count()
Out[29]:
In [30]:
# Add 0/1 indicator columns for the two attack types studied below
# (attacktype1 code 2 = armed assault, code 3 = bombing/explosion).
# `assign` returns a new frame, so nothing is written into a filtered slice of
# `gtd` (which triggers SettingWithCopyWarning), and the vectorised comparison
# replaces the slow row-by-row `apply`.
me_attks = me_attks.assign(
    armed_assault=(me_attks.attacktype1 == 2).astype(int),
    bombings=(me_attks.attacktype1 == 3).astype(int),
)
In [31]:
me_attks.head()
Out[31]:
In [32]:
# Per-year column totals for the MENA attacks (index = year).
# numeric_only=True restricts the sum to numeric columns; without it, current
# pandas tries to add up the text columns as well and either concatenates
# strings or raises TypeError.
me_attks_grouped_by_year = me_attks.groupby('year').sum(numeric_only=True)
In [33]:
# Turn the 'year' group index back into an ordinary column so later cells can
# reference `me_attks_grouped_by_year.year`.
# NOTE(review): this rebinds the same name, so running the cell a second time
# resets the index again — presumably adding a stray index column; confirm before re-running.
me_attks_grouped_by_year = me_attks_grouped_by_year.reset_index()
In [34]:
me_attks_grouped_by_year.head()
Out[34]:
In [35]:
# Split the MENA years into those with few and those with many bombings.
# NOTE(review): 174 and 269 are hard-coded — presumably the 25% / 75% values read
# from `me_attks_grouped_by_year.bombings.describe()`; confirm against that output.
BOMBINGS_LOW_MAX = 174
BOMBINGS_HIGH_MIN = 269
yearly_bombings = me_attks_grouped_by_year['bombings']
me_bombs_1st_q = me_attks_grouped_by_year[yearly_bombings <= BOMBINGS_LOW_MAX]
me_bombs_4th_q = me_attks_grouped_by_year[yearly_bombings >= BOMBINGS_HIGH_MIN]
In [36]:
me_bombs_1st_q.nkill.describe()
Out[36]:
In [37]:
me_attks_grouped_by_year.bombings.describe()
Out[37]:
In [38]:
# Split the MENA years into those with few and those with many armed assaults.
# NOTE(review): 35 and 204 are hard-coded — presumably the 25% / 75% values read
# from `me_attks_grouped_by_year.armed_assault.describe()`; confirm against that output.
ARMED_ASSAULT_LOW_MAX = 35
ARMED_ASSAULT_HIGH_MIN = 204
yearly_armed_assaults = me_attks_grouped_by_year['armed_assault']
me_aa_1st_q = me_attks_grouped_by_year[yearly_armed_assaults <= ARMED_ASSAULT_LOW_MAX]
me_aa_4th_q = me_attks_grouped_by_year[yearly_armed_assaults >= ARMED_ASSAULT_HIGH_MIN]
In [39]:
me_attks_grouped_by_year.armed_assault.describe()
Out[39]:
In [40]:
plt.style.use('fivethirtyeight')
In [41]:
# Overlay the yearly-bombing distributions of the low and high groups on one axes.
# The axis label's spelling ("Distribtuons") is corrected.
ax = sns.distplot(me_bombs_1st_q.bombings)
sns.distplot(me_bombs_4th_q.bombings, ax=ax)
ax.set(xlabel='Distributions of Bombings in Each Quartile')
plt.show()
In [42]:
# Regression plot: yearly bombings vs. yearly number killed, high-bombing MENA years.
ax = sns.regplot(x='bombings', y='nkill', data=me_bombs_4th_q)
ax.set_xlabel('Fourth Quartile Bombings in MENA')
ax.set_ylabel('Number Killed')
plt.show()
In [43]:
# Overlay the yearly armed-assault distributions of the low and high groups.
# The label previously said "Bombings" (copied from the bombing plot) and was
# misspelled; it now describes the armed-assault data actually plotted.
ax = sns.distplot(me_aa_1st_q.armed_assault)
sns.distplot(me_aa_4th_q.armed_assault, ax=ax)
ax.set(xlabel='Distributions of Armed Assaults in Each Quartile in MENA')
plt.show()
In [44]:
# Regression plot: yearly armed assaults vs. yearly number killed, low-assault MENA years.
ax = sns.regplot(x='armed_assault', y='nkill', data=me_aa_1st_q)
ax.set_xlabel('First Quartile Armed Assaults in MENA')
ax.set_ylabel('Number Killed')
plt.show()
In [45]:
# Regression plot: yearly armed assaults vs. yearly number killed, high-assault MENA years.
ax = sns.regplot(x='armed_assault', y='nkill', data=me_aa_4th_q)
ax.set_xlabel('Fourth Quartile Armed Assaults in MENA')
ax.set_ylabel('Number Killed')
plt.show()
In [48]:
# `sub_af_aa_4th_q` is not assigned by any cell in this notebook, so on a fresh
# kernel the plot raised NameError. Build it here from `sub_af_attks`:
# yearly armed-assault counts (attacktype1 == 2) and yearly deaths, keeping the
# years at or above the 75th percentile of armed assaults.
sub_af_yearly = (
    sub_af_attks
    .assign(armed_assault=(sub_af_attks.attacktype1 == 2).astype(int))
    .groupby('year')[['armed_assault', 'nkill']]
    .sum()
    .reset_index()
)
sub_af_aa_4th_q = sub_af_yearly[
    sub_af_yearly.armed_assault >= sub_af_yearly.armed_assault.quantile(0.75)
]
ax = sns.regplot(x='armed_assault', y='nkill', data=sub_af_aa_4th_q)
ax.set(xlabel='Fourth Quartile Armed Assaults in Sub Saharan Africa', ylabel='Number Killed')
plt.show()
In [49]:
# Summary statistics of the yearly armed-assault totals for Sub-Saharan Africa.
# NOTE(review): the only assignment of `sub_af_attks_grouped_by_year` visible in
# this notebook is in cell In [52], further down; on a fresh top-to-bottom run
# this cell would raise NameError — confirm the intended cell order.
sub_af_attks_grouped_by_year.armed_assault.describe()
In [50]:
me_bombs_1st_q.columns
Out[50]:
In [51]:
# Armed assaults over time, Middle East first (Sub-Saharan Africa follows below).
ax = sns.regplot(x='year', y='armed_assault', data=me_attks_grouped_by_year)
ax.set_xlabel('Year')
ax.set_ylabel('Attacks in MENA Region')
plt.title('Armed Assaults in the MENA Region')
plt.show()
In [52]:
# Add 0/1 indicator columns (attacktype1 code 2 = armed assault, code 3 = bombing).
# `assign` returns a new frame rather than writing into a filtered slice of `gtd`
# (SettingWithCopyWarning), and the vectorised comparison replaces row-wise `apply`.
sub_af_attks = sub_af_attks.assign(
    armed_assault=(sub_af_attks.attacktype1 == 2).astype(int),
    bombings=(sub_af_attks.attacktype1 == 3).astype(int),
)
# Per-year totals with 'year' kept as a regular column. numeric_only=True keeps
# the sum to numeric columns; summing the text columns fails or concatenates
# strings on current pandas.
sub_af_attks_grouped_by_year = (
    sub_af_attks.groupby('year').sum(numeric_only=True).reset_index()
)
In [53]:
# Armed assaults per year in Sub-Saharan Africa, with a fitted regression line.
ax = sns.regplot(x='year', y='armed_assault', data=sub_af_attks_grouped_by_year)
ax.set_xlabel('Year')
ax.set_ylabel('Armed Assaults per year')
plt.title('Armed Assaults in Sub-Saharan Africa')
plt.show()
In [54]:
# Bombings over time, Middle East first (Sub-Saharan Africa follows below).
ax = sns.regplot(x='year', y='bombings', data=me_attks_grouped_by_year)
ax.set_xlabel('Year')
ax.set_ylabel('Bombings per year')
plt.title('Bombings Each Year in the MENA Region')
plt.show()
In [55]:
# Bombings per year in Sub-Saharan Africa, with a fitted regression line.
ax = sns.regplot(x='year', y='bombings', data=sub_af_attks_grouped_by_year)
ax.set_xlabel('Year')
ax.set_ylabel('Bombings per year')
plt.title('Bombings Each Year in Sub-Saharan Africa')
plt.show()
In [56]:
# Number killed per year, MENA region first.
ax = sns.regplot(x='year', y='nkill', data=me_attks_grouped_by_year)
ax.set_xlabel('1970 to 2015')
ax.set_ylabel('Number Killed Each Year in the MENA Region')
plt.show()
In [57]:
# Number killed per year in Sub-Saharan Africa.
ax = sns.regplot(x='year', y='nkill', data=sub_af_attks_grouped_by_year)
ax.set_xlabel('1970 to 2015')
ax.set_ylabel('Number Killed Each Year in Sub Saharan Africa')
plt.show()
In [58]:
#gtd['date'] = pd.to_datetime((gtd.year*10000+gtd_ts.month*100+gtd.day).apply(str),format='%Y%m%d')
In [59]:
gtd.country_txt.value_counts()
Out[59]:
In [ ]:
#Let's compare the top five in Sub-Saharan Africa and top five in the Middle East.
In [60]:
sub_af_attks.country_txt.value_counts().head()
Out[60]:
In [61]:
me_attks.country_txt.value_counts().head()
Out[61]:
In [62]:
# Nigeria: subset the Sub-Saharan attacks, then histogram their attack-type codes.
nigeria_attks = sub_af_attks.loc[sub_af_attks['country_txt'] == 'Nigeria']
ax = sns.distplot(nigeria_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Nigeria')
plt.show()
In [63]:
# Somalia: subset the Sub-Saharan attacks, then histogram their attack-type codes.
somalia_attks = sub_af_attks.loc[sub_af_attks['country_txt'] == 'Somalia']
ax = sns.distplot(somalia_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Somalia')
plt.show()
# Much higher frequency of bombings in Somalia than in Nigeria.
In [64]:
# South Africa: subset the Sub-Saharan attacks, then histogram their attack-type codes.
s_af_attks = sub_af_attks.loc[sub_af_attks['country_txt'] == 'South Africa']
ax = sns.distplot(s_af_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in South Africa')
plt.show()
In [65]:
# Sudan: subset the Sub-Saharan attacks, then histogram their attack-type codes.
sudan_attks = sub_af_attks.loc[sub_af_attks['country_txt'] == 'Sudan']
ax = sns.distplot(sudan_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Sudan')
plt.show()
In [66]:
# Kenya: subset the Sub-Saharan attacks, then histogram their attack-type codes.
kenya_attks = sub_af_attks.loc[sub_af_attks['country_txt'] == 'Kenya']
ax = sns.distplot(kenya_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Kenya')
plt.show()
# Relatively equal bombings and armed assault.
In [67]:
# Iraq: subset the MENA attacks, then histogram their attack-type codes.
# The histogram previously plotted `me_attks` (the whole MENA region) under an
# "Iraq" label; it now plots the Iraq subset built on the line above.
iraq_attks = me_attks[me_attks.country_txt=='Iraq']
ax = sns.distplot(iraq_attks.attacktype1, kde=False)
ax.set(xlabel='Attack Type', ylabel='Frequency in Iraq')
plt.show()
In [68]:
# Turkey: subset the MENA attacks, then histogram their attack-type codes.
turkey_attks = me_attks.loc[me_attks['country_txt'] == 'Turkey']
ax = sns.distplot(turkey_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Turkey')
plt.show()
In [69]:
# Algeria: subset the MENA attacks, then histogram their attack-type codes.
algeria_attks = me_attks.loc[me_attks['country_txt'] == 'Algeria']
ax = sns.distplot(algeria_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Algeria')
plt.show()
In [70]:
# Yemen: subset the MENA attacks, then histogram their attack-type codes.
yemen_attks = me_attks.loc[me_attks['country_txt'] == 'Yemen']
ax = sns.distplot(yemen_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Yemen')
plt.show()
In [71]:
# Lebanon: subset the MENA attacks, then histogram their attack-type codes.
leb_attks = me_attks.loc[me_attks['country_txt'] == 'Lebanon']
ax = sns.distplot(leb_attks['attacktype1'], kde=False)
ax.set_xlabel('Attack Type')
ax.set_ylabel('Frequency in Lebanon')
plt.show()
In [ ]:
In [72]:
# compare two populations.
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('fivethirtyeight')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import pymc3 as pm
In [73]:
# (Re)build the 0/1 armed-assault flag (attacktype1 code 2) for Sub-Saharan Africa.
# `assign` returns a new frame instead of writing into a filtered slice of `gtd`
# (SettingWithCopyWarning), and the vectorised comparison replaces row-wise `apply`.
sub_af_attks = sub_af_attks.assign(
    armed_assault=(sub_af_attks.attacktype1 == 2).astype(int)
)
In [74]:
# Comparing populations in Somalia and Nigeria: start from the Sub-Saharan
# attacks with year > 2001 (the filter below; not the 2010s).
sub_af_00s = sub_af_attks[sub_af_attks.year > 2001]
In [75]:
# Somalia vs. Nigeria comparison window: attacks with 2001 < year < 2011.
after_2001 = sub_af_attks['year'] > 2001
before_2011 = sub_af_attks['year'] < 2011
sub_af_00s = sub_af_attks[after_2001 & before_2011]
In [76]:
# Split the comparison window by country.
country_of_attack = sub_af_00s['country_txt']
nig_00s = sub_af_00s.loc[country_of_attack == 'Nigeria']
som_00s = sub_af_00s.loc[country_of_attack == 'Somalia']
In [77]:
nig_00s
Out[77]:
In [78]:
nig_00s.columns
Out[78]:
In [79]:
# Keep a fixed set of columns, selected by position.
# `.iloc` makes the positional selection explicit: `df[[0, 1, ...]]` is looked up
# as column *labels*, which raises KeyError when the columns are named by strings.
# NOTE(review): the positions are magic numbers tied to the workbook's column
# order, and the cell rebinds `nig_00s`, so it can only be run once per kernel —
# consider selecting by column name instead.
nig_00s = nig_00s.iloc[:, [0, 1, 2, 4, 8, 12, 13, 17, 18, 19, 66]]
In [80]:
# Keep a fixed set of columns, selected by position.
# `.iloc` makes the positional selection explicit: `df[[0, 1, ...]]` is looked up
# as column *labels*, which raises KeyError when the columns are named by strings.
# NOTE(review): the positions are magic numbers tied to the workbook's column
# order, and the cell rebinds `som_00s`, so it can only be run once per kernel —
# consider selecting by column name instead.
som_00s = som_00s.iloc[:, [0, 1, 2, 4, 8, 12, 13, 17, 18, 19, 66]]
In [81]:
# Prior location and scale: mean and standard deviation of the armed-assault
# flag over the whole comparison window.
armed_assault_flags = sub_af_00s['armed_assault']
mean_prior_mean = armed_assault_flags.mean()
mean_prior_std = armed_assault_flags.std()
In [82]:
# Create the model and give each country's mean the same Normal prior
# (centre `mean_prior_mean`, spread `mean_prior_std`).
# The body of the `with` block is indented so the cell is valid Python.
with pm.Model() as model:
    nig_mean = pm.Normal('Nigeria Armed Assaults Mean', mean_prior_mean, sd=mean_prior_std)
    som_mean = pm.Normal('Somalia Armed Assaults Mean', mean_prior_mean, sd=mean_prior_std)
In [83]:
# Uniform priors on each country's standard deviation, bounded by the two
# constants below. The body of the `with` block is indented so the cell is valid Python.
std_prior_lower = 0.01
std_prior_upper = 100.0
with model:
    nig_std = pm.Uniform('Nigeria Armed Assaults Std', lower=std_prior_lower, upper=std_prior_upper)
    som_std = pm.Uniform('Somalia Armed Assaults Std', lower=std_prior_lower, upper=std_prior_upper)
In [84]:
# Normal likelihoods tying each country's mean/std to its observed data.
# The body of the `with` block is indented so the cell is valid Python.
# NOTE(review): `observed=` receives the whole `nig_00s` / `som_00s` frames (an
# 11-column positional subset), not just an armed-assault column — presumably a
# single numeric column was intended; confirm which column should be observed.
with model:
    nigeria_group = pm.Normal('Nigeria Armed Assaults', mu=nig_mean, sd=nig_std, observed=nig_00s)
    som_group = pm.Normal('Somalia Armed Assaults', mu=som_mean, sd=som_std, observed=som_00s)
In [85]:
# Derived quantities tracked alongside the parameters: difference of means,
# difference of stds, and the effect size (difference of means divided by the
# root of the average of the two variances).
# The body of the `with` block is indented so the cell is valid Python.
with model:
    diff_of_means = pm.Deterministic('difference of means', nig_mean - som_mean)
    diff_of_stds = pm.Deterministic('difference of stds', nig_std - som_std)
    effect_size = pm.Deterministic(
        'effect size',
        diff_of_means / np.sqrt((nig_std**2 + som_std**2) / 2),
    )
In [103]:
# Draw 10000 posterior samples. The body of the `with` block is indented so the
# cell is valid Python, and a fixed random_seed makes the trace (and every plot
# and summary derived from it) reproducible across runs.
# NOTE(review): njobs=-1 is kept as written — confirm that the installed pymc3
# treats -1 the way you expect (it is not obviously "use all cores").
with model:
    trace = pm.sample(10000, njobs=-1, random_seed=42)
In [104]:
# Posterior plots of the four group parameters, skipping the first 3000 draws.
group_parameter_names = [
    'Nigeria Armed Assaults Mean',
    'Somalia Armed Assaults Mean',
    'Nigeria Armed Assaults Std',
    'Somalia Armed Assaults Std',
]
pm.plot_posterior(trace[3000:], varnames=group_parameter_names, color='#87ceeb')
Out[104]:
In [105]:
# Posterior plots of the derived comparison quantities against a reference
# value of 0, skipping the first 3000 draws.
comparison_names = ['difference of means', 'difference of stds', 'effect size']
pm.plot_posterior(trace[3000:], varnames=comparison_names, ref_val=0, color='#87ceeb')
Out[105]:
In [107]:
# Summary of the derived comparison quantities, skipping the first 3000 draws.
summary_names = ['difference of means', 'difference of stds', 'effect size']
pm.summary(trace[3000:], varnames=summary_names)
In [108]:
#Drop the 73 null values for lat and long.
In [109]:
# Predicting for 1993: take the neighbouring years (1992 and 1994) for both regions.
YEAR_BEFORE, YEAR_AFTER = 1992, 1994
me_attks_b4 = me_attks.loc[me_attks['year'] == YEAR_BEFORE]
me_attks_aftr = me_attks.loc[me_attks['year'] == YEAR_AFTER]
sub_af_attks_b4 = sub_af_attks.loc[sub_af_attks['year'] == YEAR_BEFORE]
sub_af_attks_aftr = sub_af_attks.loc[sub_af_attks['year'] == YEAR_AFTER]
In [110]:
sub_af_attks_b4.head()
Out[110]:
In [111]:
sub_af_attks_aftr.head()
Out[111]:
In [112]:
# Bombings per month in 1992. `bombings` is a 0/1 flag, so the monthly total is
# its sum; `.count()` counted every attack in the month regardless of type.
bombs_per_month_1 = sub_af_attks_b4.groupby('month').bombings.sum()
In [113]:
# Bombings per month in 1994. `bombings` is a 0/1 flag, so the monthly total is
# its sum; `.count()` counted every attack in the month regardless of type.
bombs_per_month_2 = sub_af_attks_aftr.groupby('month').bombings.sum()
In [114]:
# Estimate 1993 as the month-by-month average of the two neighbouring years.
# The two series are aligned on their month labels first; stacking the raw arrays
# paired values by position and broke when the years covered different months.
# A month absent from one year is treated as 0 for that year.
# `.values` keeps the result a plain array, as before.
monthly_both_years = (
    pd.concat([bombs_per_month_1, bombs_per_month_2], axis=1, keys=['before', 'after'])
    .sort_index()
    .fillna(0)
)
bombs_per_month_93 = monthly_both_years.mean(axis=1).values
In [115]:
# Wrap the averaged monthly values in a DataFrame for display (rebinds the same name).
# NOTE(review): the values arrive as a plain array, so the rows presumably get a
# default 0-based index rather than month numbers — confirm when reading the table.
bombs_per_month_93 = pd.DataFrame(bombs_per_month_93)
In [116]:
bombs_per_month_93
Out[116]:
In [117]:
# Bombings per country in 1992. `bombings` is a 0/1 flag, so the per-country total
# is its sum; `.count()` counted every attack in the country regardless of type.
bombs_per_country_1 = sub_af_attks_b4.groupby('country_txt').bombings.sum()
In [118]:
# Bombings per country in 1994. `bombings` is a 0/1 flag, so the per-country total
# is its sum; `.count()` counted every attack in the country regardless of type.
bombs_per_country_2 = sub_af_attks_aftr.groupby('country_txt').bombings.sum()
In [119]:
# Convert the 1992 per-country result into a DataFrame (rebinds the same name)
# so it can be concatenated with the 1994 frame below.
bombs_per_country_1 = pd.DataFrame(bombs_per_country_1)
In [120]:
# Convert the 1994 per-country result into a DataFrame (rebinds the same name)
# so it can be concatenated with the 1992 frame.
bombs_per_country_2 = pd.DataFrame(bombs_per_country_2)
In [121]:
# Stack the 1992 and 1994 per-country frames, then average the rows that share a
# country label to get a per-country estimate for 1993.
# The averaged table is only displayed as the cell output; it is not stored in a variable.
# NOTE(review): a country present in only one of the two years is averaged over
# that single row (not halved) — confirm that this is the intended estimate.
# NOTE(review): the name reads "county"; presumably "country" was meant.
bombs_per_county_93 = pd.concat((bombs_per_country_1, bombs_per_country_2))
bombs_per_county_93.groupby(bombs_per_county_93.index).mean()
Out[121]:
In [ ]: